#Importando as bibliotecas utilizadas
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
import seaborn as sns
import matplotlib.pyplot as plt
Lendo o dataset disponibilizado
# Load the provided bookings dataset and display it.
df = pd.read_csv('cancellation_prediction.csv')
df
| type | cancellation | days_between_booking_arrival | year_arrival_date | month_arrival_date | week_number_arrival_date | day_of_month_arrival_date | num_weekend_nights | num_workweek_nights | num_adults | ... | num_previous_stays | reserved_room | changes_between_booking_arrival | deposit_policy | id_travel_agency_booking | id_person_booking | customer_type | avg_price | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Fancy Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0 | C | 3 | No Deposit | NaN | NaN | 0 | 0.00 | 0 | 0 |
| 1 | Fancy Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0 | C | 4 | No Deposit | NaN | NaN | 0 | 0.00 | 0 | 0 |
| 2 | Fancy Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 0 | A | 0 | No Deposit | NaN | NaN | 0 | 75.00 | 0 | 0 |
| 3 | Fancy Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 0 | A | 0 | No Deposit | 304.0 | NaN | 0 | 75.00 | 0 | 0 |
| 4 | Fancy Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | 0 | A | 0 | No Deposit | 240.0 | NaN | 0 | 98.00 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | Hotel | 0 | 23 | 2017 | August | 35 | 30 | 2 | 5 | 2 | ... | 0 | A | 0 | No Deposit | 394.0 | NaN | 0 | 96.14 | 0 | 0 |
| 119386 | Hotel | 0 | 102 | 2017 | August | 35 | 31 | 2 | 5 | 3 | ... | 0 | E | 0 | No Deposit | 9.0 | NaN | 0 | 225.43 | 0 | 2 |
| 119387 | Hotel | 0 | 34 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | 0 | D | 0 | No Deposit | 9.0 | NaN | 0 | 157.71 | 0 | 4 |
| 119388 | Hotel | 0 | 109 | 2017 | August | 35 | 31 | 2 | 5 | 2 | ... | 0 | A | 0 | No Deposit | 89.0 | NaN | 0 | 104.40 | 0 | 0 |
| 119389 | Hotel | 0 | 205 | 2017 | August | 35 | 29 | 2 | 7 | 2 | ... | 0 | A | 0 | No Deposit | 9.0 | NaN | 0 | 151.20 | 0 | 2 |
119390 rows × 28 columns
Exibindo as caracteristicas do dataset
# Show dataset structure: column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 type 119390 non-null object 1 cancellation 119390 non-null int64 2 days_between_booking_arrival 119390 non-null int64 3 year_arrival_date 119390 non-null int64 4 month_arrival_date 119390 non-null object 5 week_number_arrival_date 119390 non-null int64 6 day_of_month_arrival_date 119390 non-null int64 7 num_weekend_nights 119390 non-null int64 8 num_workweek_nights 119390 non-null int64 9 num_adults 119390 non-null int64 10 num_children 119386 non-null float64 11 num_babies 119390 non-null int64 12 breakfast 119390 non-null bool 13 country 118902 non-null object 14 market_segment 119390 non-null int64 15 distribution_channel 119390 non-null int64 16 repeated_guest 119390 non-null int64 17 num_previous_cancellations 119390 non-null int64 18 num_previous_stays 119390 non-null int64 19 reserved_room 119390 non-null object 20 changes_between_booking_arrival 119390 non-null int64 21 deposit_policy 119390 non-null object 22 id_travel_agency_booking 103050 non-null float64 23 id_person_booking 6797 non-null float64 24 customer_type 119390 non-null int64 25 avg_price 119390 non-null float64 26 required_car_parking_spaces 119390 non-null int64 27 total_of_special_requests 119390 non-null int64 dtypes: bool(1), float64(4), int64(18), object(5) memory usage: 24.7+ MB
As covariáveis não possuem dados faltantes significativos, com exceção das variáveis 'id_travel_agency_booking' e 'id_person_booking'. Portanto, devido à natureza dos dados que possuem elementos NA, será usado um valor/string específico para representar essa ocorrência
# Testes para verificar o tipo de resposta encontra nas colunas 'id_travel_agency_booking' e 'id_person_booking'
#df.loc[:,'id_travel_agency_booking'].unique()
#df.loc[:,'id_person_booking'].unique()
# Fill the missing values of each variable.
# BUG FIX: the original used df.loc[:, col].fillna(..., inplace=True), which
# mutates an intermediate object (SettingWithCopyWarning; a silent no-op under
# pandas copy-on-write). Assigning the result back to the column is the
# supported, version-safe pattern.
df['country'] = df['country'].fillna('Not informed')
df['num_children'] = df['num_children'].fillna(0)
df['id_travel_agency_booking'] = df['id_travel_agency_booking'].fillna(0)
df['id_person_booking'] = df['id_person_booking'].fillna(0)
# num_children has no NaN left, so it can be safely cast to integer.
df.num_children = df.num_children.astype(int)
# Re-check the dataset structure after the NA treatment.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 type 119390 non-null object 1 cancellation 119390 non-null int64 2 days_between_booking_arrival 119390 non-null int64 3 year_arrival_date 119390 non-null int64 4 month_arrival_date 119390 non-null object 5 week_number_arrival_date 119390 non-null int64 6 day_of_month_arrival_date 119390 non-null int64 7 num_weekend_nights 119390 non-null int64 8 num_workweek_nights 119390 non-null int64 9 num_adults 119390 non-null int64 10 num_children 119390 non-null int32 11 num_babies 119390 non-null int64 12 breakfast 119390 non-null bool 13 country 119390 non-null object 14 market_segment 119390 non-null int64 15 distribution_channel 119390 non-null int64 16 repeated_guest 119390 non-null int64 17 num_previous_cancellations 119390 non-null int64 18 num_previous_stays 119390 non-null int64 19 reserved_room 119390 non-null object 20 changes_between_booking_arrival 119390 non-null int64 21 deposit_policy 119390 non-null object 22 id_travel_agency_booking 119390 non-null float64 23 id_person_booking 119390 non-null float64 24 customer_type 119390 non-null int64 25 avg_price 119390 non-null float64 26 required_car_parking_spaces 119390 non-null int64 27 total_of_special_requests 119390 non-null int64 dtypes: bool(1), float64(3), int32(1), int64(18), object(5) memory usage: 24.3+ MB
Após fazer um breve tratamento dos dados, será feito agora uma checagem em relação a correlação das variaveis, no intuito de identificar variaveis que na pratica representam a mesma informação ou comportamento
# Correlation heatmap of the numeric covariates.
# numeric_only=True keeps the historical behavior (silently dropping the
# object/string columns) on pandas >= 2.0, where DataFrame.corr() would
# otherwise raise on non-numeric data.
figura = plt.figure(figsize=(20,15))
sns.heatmap(df.corr(numeric_only=True), annot=True);
Como não foi encontrada correlação expressiva em nenhum par de covariáveis, não será descartada nenhuma variável no modelo
Faremos primeiramente uma analise da relação entre o tipo do Hotel e o cancelamento da estadia
# Total cancellations per hotel type.
df_aux = df.loc[:,['type','cancellation']]
# The 'sum' string alias replaces np.sum: passing numpy callables to aggfunc
# is deprecated in recent pandas and resolves to the same reduction.
pivot_df = df_aux.pivot_table(index='type', aggfunc={'cancellation': 'sum'})
pivot_df
| cancellation | |
|---|---|
| type | |
| Fancy Hotel | 11122 |
| Hotel | 33102 |
# Bar chart: absolute number of cancellations per hotel type.
bar_trace = go.Bar(x = pivot_df.index, y = pivot_df.cancellation, textposition='auto')
fig = go.Figure(data=[bar_trace])
fig.update_traces(marker_color='rgb(230, 10, 10)',
                  marker_line_color='rgb(85, 10, 10)',
                  marker_line_width=1.5,
                  opacity=0.6)
fig.update_layout(yaxis_title = 'Number of cancellations',
                  xaxis_title = 'Type',
                  template="plotly_white")
fig.show()
Analisaremos agora a porcentagem que esses cancelamentos representam em cada grupo
# Share of cancellations within each hotel type, as a percentage of that
# type's own bookings.
for hotel_type in ('Hotel', 'Fancy Hotel'):
    n_bookings = (df_aux.loc[:, 'type'] == hotel_type).sum()
    pivot_df.loc[hotel_type, 'percentage_cancel'] = (
        pivot_df.loc[hotel_type, 'cancellation'] / n_bookings) * 100
pivot_df
| cancellation | percentage_cancel | |
|---|---|---|
| type | ||
| Fancy Hotel | 11122 | 27.763355 |
| Hotel | 33102 | 41.726963 |
# Side-by-side pie charts of cancelled vs. not-cancelled share per hotel type.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=("Hotel",
                    "Fancy Hotel")
)

def _cancel_pie(hotel_type):
    # Build one pie trace from the percentage computed in pivot_df.
    cancelled = pivot_df.loc[hotel_type, 'percentage_cancel']
    return go.Pie(labels=['Not Cancelled','Cancelled'],
                  values=[100 - cancelled, cancelled],
                  hoverinfo="label+percent+name",
                  pull=[0, 0.1],
                  marker=dict(colors=['darkblue', 'cyan'],
                              line=dict(color='#000000', width=1)))

fig.add_trace(_cancel_pie('Hotel'), row = 1, col = 1)
fig.add_trace(_cancel_pie('Fancy Hotel'), row = 1, col = 2)
Podemos observar, portanto, que apesar de a quantidade absoluta de cancelamentos em Hotel ser aproximadamente 3 vezes maior que a de Fancy Hotel, essa relação é um pouco menos discrepante quando considerado o total de eventos de ambos os tipos
# Cancellations per (weekend nights, workweek nights) combination; absent
# combinations are filled with 0.
df_aux = df.loc[:,['num_weekend_nights','num_workweek_nights','cancellation']]
# 'sum' string alias instead of the deprecated np.sum aggfunc callable.
pivot_df = df_aux.pivot_table(index='num_weekend_nights', columns = 'num_workweek_nights',
                              aggfunc={'cancellation': 'sum'}).fillna(0)
# Flatten the ('cancellation', n) MultiIndex columns to plain positional ints.
pivot_df.columns = list(range(pivot_df.shape[1]))
pivot_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| num_weekend_nights | |||||||||||||||||||||
| 0 | 35.0 | 4295.0 | 8437.0 | 4825.0 | 1614.0 | 295.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 976.0 | 2824.0 | 3814.0 | 2121.0 | 809.0 | 463.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 902.0 | 2712.0 | 2623.0 | 1559.0 | 1072.0 | 2969.0 | 331.0 | 162.0 | 116.0 | 28.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 110.0 | 127.0 | 116.0 | 47.0 | 30.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 24.0 | 124.0 | 45.0 | 57.0 | 37.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 13 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 14 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 16 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 18 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 19 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
17 rows × 35 columns
# 3-D surface of cancellations over the nights grid.
fig = go.Figure(data=[go.Surface(z = pivot_df.values,
                                 x = pivot_df.columns,
                                 y = pivot_df.index)])
# BUG FIX: x carries the pivot columns (num_workweek_nights) and y carries the
# index (num_weekend_nights); the original axis titles were swapped.
fig.update_layout(scene = dict(
                    xaxis = dict(
                        title='x:Workweek nights'),
                    yaxis = dict(
                        title='y:Weekend nights'),
                    zaxis = dict(
                        title='z:Cancellations'),),
                 )
fig.update_layout(autosize=False,
                  width=1000,
                  height=1000,
                  margin=dict(
                      l=65,
                      r=50,
                      b=65,
                      t=90)
                 )
fig.show()
Analisaremos primeiramente as variaveis ('num_adults', 'num_children', 'num_babies') de forma separada, para entender o comportamento de cada uma em relação ao cancelamento
# Cancellations aggregated separately by number of adults, children and babies.
# 'sum' string alias replaces the deprecated np.sum aggfunc callable.
df_aux = df.loc[:,['num_adults','cancellation']]
pivot_df_adults = df_aux.pivot_table(index='num_adults', aggfunc={'cancellation': 'sum'})
df_aux = df.loc[:,['num_children','cancellation']]
pivot_df_children = df_aux.pivot_table(index='num_children', aggfunc={'cancellation': 'sum'})
df_aux = df.loc[:,['num_babies','cancellation']]
pivot_df_babies = df_aux.pivot_table(index='num_babies', aggfunc={'cancellation': 'sum'})
# One scatter subplot per variable: adults (1,1), children (1,2), babies (2,1).
fig = make_subplots(
    rows=2, cols=2,
    vertical_spacing=0.1,
    subplot_titles=("Adults",
                    "Children",
                    "Babies")
)
fig.add_trace(go.Scatter(x=pivot_df_adults.index, y=pivot_df_adults.cancellation, mode='lines+markers', name = 'adults'),
              row = 1, col = 1
)
fig.add_trace(go.Scatter(x=pivot_df_children.index, y=pivot_df_children.cancellation, mode='lines+markers', name = 'children'),
              row = 1, col = 2
)
fig.add_trace(go.Scatter(x=pivot_df_babies.index, y=pivot_df_babies.cancellation, mode='lines+markers', name = 'babies'),
              row = 2, col = 1
)
# Update x-axis properties.
# BUG FIX: the children trace lives at (row=1, col=2) and the babies trace at
# (row=2, col=1); the original assigned those two x-axis titles to each
# other's subplot.
fig.update_xaxes(title_text="Qte. Adults", row=1, col=1)
fig.update_xaxes(title_text="Qte. Children", row=1, col=2)
fig.update_xaxes(title_text="Qte. Babies", row=2, col=1)
# Update y-axis properties (same label on every populated subplot).
fig.update_yaxes(title_text="Cancellations", row=1, col=1)
fig.update_yaxes(title_text="Cancellations", row=1, col=2)
fig.update_yaxes(title_text="Cancellations", row=2, col=1)
fig.update_layout(autosize=True,
                  width=1000,
                  height=800,
                  margin=dict(
                      l=50,
                      r=50,
                      b=50,
                      t=50),
                  template="plotly_white"
                 )
Para entender quais são as principais combinações de pessoas que fazem o cancelamento, faremos a analise da combinação das 3 variaveis, obtendo assim a tabela abaixo
# Total cancellations per (adults, children, babies) combination, sorted from
# the most to the least cancelled group.
df_aux = df.loc[:,['num_adults','num_children','num_babies','cancellation']]
pivot_df_general = df_aux.pivot_table(index=['num_adults','num_children','num_babies'],
                                      aggfunc={'cancellation': 'sum'})
pivot_df_general.reset_index(inplace = True)
pivot_df_general.sort_values(by = 'cancellation', ascending = False, inplace = True)
pivot_df_general.reset_index(drop = True, inplace = True)
# Percentage of cancellations relative to the size of each group.
# FIX: row[0]/row[1]/row[2] positional access on a Series is deprecated
# (integer keys are treated as labels in future pandas); use column labels.
for index, row in pivot_df_general.iterrows():
    mask_ad = df_aux.loc[:,'num_adults'] == row['num_adults']
    mask_ch = df_aux.loc[:,'num_children'] == row['num_children']
    mask_bb = df_aux.loc[:,'num_babies'] == row['num_babies']
    group_size = df_aux.loc[mask_ad & mask_ch & mask_bb].shape[0]
    pivot_df_general.loc[index,'percentage_cancel'] = round(
        (pivot_df_general.loc[index,'cancellation'] / group_size) * 100, 2)
pivot_df_general
| num_adults | num_children | num_babies | cancellation | percentage_cancel | |
|---|---|---|---|---|---|
| 0 | 2 | 0 | 0 | 32424 | 39.75 |
| 1 | 1 | 0 | 0 | 6555 | 29.03 |
| 2 | 3 | 0 | 0 | 1930 | 34.06 |
| 3 | 2 | 2 | 0 | 1392 | 43.22 |
| 4 | 2 | 1 | 0 | 1269 | 32.13 |
| 5 | 3 | 1 | 0 | 211 | 43.51 |
| 6 | 2 | 0 | 1 | 127 | 17.94 |
| 7 | 0 | 2 | 0 | 80 | 39.02 |
| 8 | 1 | 1 | 0 | 65 | 23.47 |
| 9 | 1 | 2 | 0 | 47 | 30.32 |
| 10 | 0 | 0 | 0 | 25 | 13.89 |
| 11 | 2 | 1 | 1 | 21 | 15.44 |
| 12 | 4 | 0 | 0 | 15 | 25.42 |
| 13 | 2 | 3 | 0 | 12 | 19.67 |
| 14 | 2 | 2 | 1 | 10 | 37.04 |
| 15 | 3 | 2 | 0 | 9 | 23.08 |
| 16 | 26 | 0 | 0 | 5 | 100.00 |
| 17 | 0 | 3 | 0 | 3 | 27.27 |
| 18 | 1 | 0 | 1 | 3 | 33.33 |
| 19 | 5 | 0 | 0 | 2 | 100.00 |
| 20 | 1 | 3 | 0 | 2 | 50.00 |
| 21 | 1 | 2 | 1 | 2 | 100.00 |
| 22 | 27 | 0 | 0 | 2 | 100.00 |
| 23 | 2 | 0 | 2 | 2 | 16.67 |
| 24 | 20 | 0 | 0 | 2 | 100.00 |
| 25 | 4 | 1 | 0 | 1 | 50.00 |
| 26 | 10 | 0 | 0 | 1 | 100.00 |
| 27 | 6 | 0 | 0 | 1 | 100.00 |
| 28 | 40 | 0 | 0 | 1 | 100.00 |
| 29 | 50 | 0 | 0 | 1 | 100.00 |
| 30 | 0 | 2 | 1 | 1 | 33.33 |
| 31 | 3 | 0 | 1 | 1 | 10.00 |
| 32 | 2 | 10 | 0 | 1 | 100.00 |
| 33 | 55 | 0 | 0 | 1 | 100.00 |
| 34 | 4 | 0 | 1 | 0 | 0.00 |
| 35 | 3 | 1 | 1 | 0 | 0.00 |
| 36 | 0 | 1 | 0 | 0 | 0.00 |
| 37 | 2 | 1 | 2 | 0 | 0.00 |
| 38 | 2 | 0 | 10 | 0 | 0.00 |
| 39 | 1 | 0 | 9 | 0 | 0.00 |
| 40 | 1 | 1 | 1 | 0 | 0.00 |
Temos portanto a relação tanto do numero de cancelamentos em relação a cada combinação de parametro, assim como sua porcentagem de acordo com a amostra disponibilizada.
Como temos covariaveis que possuem informações importantes para o modelo, porém estão em string, será usado o encoding para reorganizar as respostas em valores binários, de forma a identificar se o evento possui ou não aquela caracteristica
# One-hot encode the remaining string covariates into binary indicator columns.
categorical_cols = ['type', 'month_arrival_date', 'country', 'reserved_room', 'deposit_policy']
enc = OneHotEncoder()
encoded = enc.fit_transform(df[categorical_cols]).toarray()
# Inspect the categories discovered for each encoded column.
enc.categories_
[array(['Fancy Hotel', 'Hotel'], dtype=object),
array(['April', 'August', 'December', 'February', 'January', 'July',
'June', 'March', 'May', 'November', 'October', 'September'],
dtype=object),
array(['ABW', 'AGO', 'AIA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM',
'ATA', 'ATF', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA',
'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BOL', 'BRA', 'BRB',
'BWA', 'CAF', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'CN', 'COL',
'COM', 'CPV', 'CRI', 'CUB', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI',
'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ESP', 'EST', 'ETH',
'FIN', 'FJI', 'FRA', 'FRO', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA',
'GIB', 'GLP', 'GNB', 'GRC', 'GTM', 'GUY', 'HKG', 'HND', 'HRV',
'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR',
'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KHM', 'KIR',
'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBY', 'LCA', 'LIE', 'LKA',
'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDG', 'MDV', 'MEX',
'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MOZ', 'MRT', 'MUS', 'MWI',
'MYS', 'MYT', 'NAM', 'NCL', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL',
'NZL', 'Not informed', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW',
'POL', 'PRI', 'PRT', 'PRY', 'PYF', 'QAT', 'ROU', 'RUS', 'RWA',
'SAU', 'SDN', 'SEN', 'SGP', 'SLE', 'SLV', 'SMR', 'SRB', 'STP',
'SUR', 'SVK', 'SVN', 'SWE', 'SYC', 'SYR', 'TGO', 'THA', 'TJK',
'TMP', 'TUN', 'TUR', 'TWN', 'TZA', 'UGA', 'UKR', 'UMI', 'URY',
'USA', 'UZB', 'VEN', 'VGB', 'VNM', 'ZAF', 'ZMB', 'ZWE'],
dtype=object),
array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'L', 'P'], dtype=object),
array(['No Deposit', 'Non Refund', 'Refundable'], dtype=object)]
# Replace the raw string columns by their one-hot expansion.
df_model_aux = df.copy()
df_model_aux.drop(['type', 'month_arrival_date', 'country', 'reserved_room', 'deposit_policy'],
                  axis = 1, inplace=True)
encoded_frame = pd.DataFrame(encoded, columns = list(enc.get_feature_names_out()))
df_model = pd.concat([df_model_aux, encoded_frame], axis=1)
df_model
| cancellation | days_between_booking_arrival | year_arrival_date | week_number_arrival_date | day_of_month_arrival_date | num_weekend_nights | num_workweek_nights | num_adults | num_children | num_babies | ... | reserved_room_D | reserved_room_E | reserved_room_F | reserved_room_G | reserved_room_H | reserved_room_L | reserved_room_P | deposit_policy_No Deposit | deposit_policy_Non Refund | deposit_policy_Refundable | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 342 | 2015 | 27 | 1 | 0 | 0 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 0 | 737 | 2015 | 27 | 1 | 0 | 0 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 0 | 7 | 2015 | 27 | 1 | 0 | 1 | 1 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 0 | 13 | 2015 | 27 | 1 | 0 | 1 | 1 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 0 | 14 | 2015 | 27 | 1 | 0 | 2 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 119385 | 0 | 23 | 2017 | 35 | 30 | 2 | 5 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 119386 | 0 | 102 | 2017 | 35 | 31 | 2 | 5 | 3 | 0 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 119387 | 0 | 34 | 2017 | 35 | 31 | 2 | 5 | 2 | 0 | 0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 119388 | 0 | 109 | 2017 | 35 | 31 | 2 | 5 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 119389 | 0 | 205 | 2017 | 35 | 29 | 2 | 7 | 2 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
119390 rows × 228 columns
Faremos agora a separação do conjunto de teste e treino, de forma aleatoria, com 20% do conjunto destinado a teste
# Random 80/20 train/test split; column 0 is the target ('cancellation'),
# the remaining columns are the features.
train, test = train_test_split(df_model, test_size=0.2, random_state=42)
X_train, Y_train = train.iloc[:,1:], train.iloc[:,0]
X_test, Y_test = test.iloc[:,1:], test.iloc[:,0]
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)
(95512, 227) (23878, 227) (95512,) (23878,)
# Baseline multilayer perceptron with logistic activation and a fixed seed.
mlp_classifier = MLPClassifier(activation = 'logistic', random_state = 42)
# Reset the indices so features and target stay positionally aligned.
X_train = X_train.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
mlp_classifier.fit(X_train, Y_train)
MLPClassifier(activation='logistic', random_state=42)
# Evaluate the MLP on the held-out test set.
pred_mlp = mlp_classifier.predict(X_test)
# BUG FIX: sklearn metrics expect (y_true, y_pred). The original passed the
# prediction first, which swaps precision with recall and misreports roc_auc;
# only accuracy is symmetric in its arguments.
acc_mlp = accuracy_score(Y_test, pred_mlp)
roc_mlp = roc_auc_score(Y_test, pred_mlp)
pre_mlp = precision_score(Y_test, pred_mlp)
f1_mlp = f1_score(Y_test, pred_mlp)
rec_mlp = recall_score(Y_test, pred_mlp)
print('Accuracy Score: ', acc_mlp)
print('AUC Score: ', roc_mlp)
print('Precision Score: ', pre_mlp)
print('F1 Score: ', f1_mlp)
print('Recall Score: ', rec_mlp)
Accuracy Score: 0.832691180165843 AUC Score: 0.8369785147296442 Precision Score: 0.6761787983502396 F1 Score: 0.7522787871271779 Recall Score: 0.8476802683063164
# Decision-tree baseline: information-gain (entropy) split criterion,
# fixed seed for reproducibility.
dtc = DecisionTreeClassifier(criterion='entropy', random_state = 42)
dtc.fit(X_train, Y_train)
DecisionTreeClassifier(criterion='entropy', random_state=42)
# Evaluate the decision tree on the held-out test set.
Y_pred_dtc = dtc.predict(X_test)
# BUG FIX: sklearn metrics expect (y_true, y_pred); the original order swapped
# precision with recall (accuracy alone is symmetric).
acc_dtc = accuracy_score(Y_test, Y_pred_dtc)
roc_dtc = roc_auc_score(Y_test, Y_pred_dtc)
pre_dtc = precision_score(Y_test, Y_pred_dtc)
f1_dtc = f1_score(Y_test, Y_pred_dtc)
rec_dtc = recall_score(Y_test, Y_pred_dtc)
print('Accuracy Score: ', acc_dtc)
print('AUC Score: ', roc_dtc)
print('Precision Score: ', pre_dtc)
print('F1 Score: ', f1_dtc)
print('Recall Score: ', rec_dtc)
Accuracy Score: 0.8620487478013233 AUC Score: 0.8529162917662049 Precision Score: 0.8169657786199978 F1 Score: 0.8165106951871658 Recall Score: 0.8160561184723305
# Random-forest baseline with default hyper-parameters and a fixed seed.
clf = RandomForestClassifier(random_state = 42)
clf.fit(X_train, Y_train)
RandomForestClassifier(random_state=42)
# Evaluate the random forest on the held-out test set.
Y_pred_clf = clf.predict(X_test)
# BUG FIX: sklearn metrics expect (y_true, y_pred); the original order swapped
# precision with recall (accuracy alone is symmetric).
acc_clf = accuracy_score(Y_test, Y_pred_clf)
roc_clf = roc_auc_score(Y_test, Y_pred_clf)
pre_clf = precision_score(Y_test, Y_pred_clf)
f1_clf = f1_score(Y_test, Y_pred_clf)
rec_clf = recall_score(Y_test, Y_pred_clf)
print('Accuracy Score: ', acc_clf)
print('AUC Score: ', roc_clf)
print('Precision Score: ', pre_clf)
print('F1 Score: ', f1_clf)
print('Recall Score: ', rec_clf)
Accuracy Score: 0.8950917162241394 AUC Score: 0.8960270243021774 Precision Score: 0.8120610857206555 F1 Score: 0.853294289897511 Recall Score: 0.898938795656466
# Collect the metrics of the three classifiers and rank them by accuracy.
metric_rows = [
    ['MLP', acc_mlp, roc_mlp, pre_mlp, f1_mlp, rec_mlp],
    ['Decision Tree', acc_dtc, roc_dtc, pre_dtc, f1_dtc, rec_dtc],
    ['Random Forest', acc_clf, roc_clf, pre_clf, f1_clf, rec_clf],
]
metric_names = ['Method', 'Accuracy Score', 'AUC Score',
                'Precision Score', 'F1 Score', 'Recall Score']
df_metrics = pd.DataFrame(metric_rows, columns = metric_names)
df_metrics = df_metrics.sort_values(by = 'Accuracy Score',
                                    ascending = False).reset_index(drop = True)
df_metrics
| Method | Accuracy Score | AUC Score | Precision Score | F1 Score | Recall Score | |
|---|---|---|---|---|---|---|
| 0 | Random Forest | 0.895092 | 0.896027 | 0.812061 | 0.853294 | 0.898939 |
| 1 | Decision Tree | 0.862049 | 0.852916 | 0.816966 | 0.816511 | 0.816056 |
| 2 | MLP | 0.832691 | 0.836979 | 0.676179 | 0.752279 | 0.847680 |
Como encontramos que a classificação por Random Forest apresentou melhor resultado, vamos realizar o cross-validation para esse preditor.
Será aplicado também um GridSearch, no intuito de encontrar os hiperparametros que proporcionam o melhor resultado de acordo com a métrica escolhida
# 5-fold cross-validation of the fitted random-forest configuration on the
# training data.
scores = cross_val_score(clf, X_train, Y_train, cv=5)
scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
0.89 accuracy with a standard deviation of 0.00
Considerando portanto o preditor usando Random Forest, aplicaremos um grid search com cross-validation 5-fold para identificar o modelo cujos hiperparâmetros resultam no melhor resultado
# Grid search over the forest size, with 5-fold CV, refitting on accuracy.
clf = RandomForestClassifier()
# Hyper-parameter candidates.
parametros = {
    'n_estimators': [10, 50, 100, 200],   # forest sizes to try
    'random_state': [42],                 # fixed seed for reproducibility
    'class_weight': ['balanced'],         # reweight classes by inverse frequency
}
# Chosen evaluation metric.
score = {'Accuracy': make_scorer(accuracy_score)}
grid_3fold = GridSearchCV(estimator = clf,
                          param_grid = parametros,
                          scoring = score,
                          refit = 'Accuracy',   # refit the best model on accuracy
                          cv = 5)               # 5-fold cross-validation
# Run the exhaustive grid search on the training data.
grid_3fold.fit(X_train, Y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'class_weight': ['balanced'],
'n_estimators': [10, 50, 100, 200],
'random_state': [42]},
refit='Accuracy',
scoring={'Accuracy': make_scorer(accuracy_score)})
# Full cross-validation results, one row per hyper-parameter combination.
pd.DataFrame(grid_3fold.cv_results_)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_class_weight | param_n_estimators | param_random_state | params | split0_test_Accuracy | split1_test_Accuracy | split2_test_Accuracy | split3_test_Accuracy | split4_test_Accuracy | mean_test_Accuracy | std_test_Accuracy | rank_test_Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.645152 | 0.134326 | 0.421392 | 0.026679 | balanced | 10 | 42 | {'class_weight': 'balanced', 'n_estimators': 1... | 0.873999 | 0.871329 | 0.874516 | 0.875196 | 0.872317 | 0.873471 | 0.001433 | 4 |
| 1 | 12.427584 | 0.415930 | 0.684188 | 0.017915 | balanced | 50 | 42 | {'class_weight': 'balanced', 'n_estimators': 5... | 0.885306 | 0.881851 | 0.885928 | 0.887342 | 0.885981 | 0.885281 | 0.001840 | 3 |
| 2 | 24.870695 | 1.018856 | 1.198111 | 0.087982 | balanced | 100 | 42 | {'class_weight': 'balanced', 'n_estimators': 1... | 0.888552 | 0.882898 | 0.887656 | 0.888598 | 0.886923 | 0.886925 | 0.002107 | 2 |
| 3 | 47.220283 | 0.483546 | 1.747651 | 0.059191 | balanced | 200 | 42 | {'class_weight': 'balanced', 'n_estimators': 2... | 0.888866 | 0.882479 | 0.888598 | 0.889174 | 0.887446 | 0.887313 | 0.002486 | 1 |
# Best hyper-parameter combination found by the grid search.
grid_3fold.best_params_
{'class_weight': 'balanced', 'n_estimators': 200, 'random_state': 42}
# Mean cross-validated accuracy of the best combination.
grid_3fold.best_score_
0.8873126233398079
# Test-set accuracy of the refitted best model.
Y_best_pred = grid_3fold.predict(X_test)
# FIX: sklearn's signature is (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the order is corrected for consistency with the API.
accuracy_score(Y_test, Y_best_pred)
0.8963899824105872
Temos que a separação temporal tem como objetivo, normalmente, de criar um modelo que consiga prever o comportamento de uma determinada variavel resposta de acordo com o seu comportamento em periodos anteriores, conseguindo até mesmo identificar comportamentos diversos em diferentes sazonalidades. Como por exemplo, a previsão do comportamento de preço de um produto ou de uma ação para um determinado periodo futuro.
Porém, nesse caso em específico, a sazonalidade de cancelamentos não é tão concretamente estabelecida; assim como ocorre em casos de operações fraudulentas em cartões de crédito, os casos são identificados principalmente pelas demais covariáveis/informações sobre o evento.
Sendo assim, é suspeitado que a separação aleatória irá performar melhor nesse caso, uma vez que permitirá ao modelo identificar a relevância de cada variável, assim como a do tempo, em todo o conjunto disponibilizado para fazer a previsão de futuros clientes. Evitando assim a previsão errada de determinados eventos, devido ao fato de o último período não apresentar comportamento semelhante aos anteriores, por motivos diversos.
Faremos portanto o teste separando o conjunto de dados de treino e teste pelo ano registrado (treino em 2015 e 2016 e teste/validação em 2017)
# Years present in the dataset (used for the temporal split below).
df_model.year_arrival_date.unique()
array([2015, 2016, 2017], dtype=int64)
# Temporal split: train on 2015-2016 arrivals, validate on 2017 arrivals.
mask_2015 = df_model.year_arrival_date == 2015
mask_2016 = df_model.year_arrival_date == 2016
mask_2017 = df_model.year_arrival_date == 2017
train = df_model.loc[mask_2015 | mask_2016, :]
test = df_model.loc[mask_2017, :]
# Column 0 is the target ('cancellation'); the rest are features.
X_train, Y_train = train.iloc[:,1:], train.iloc[:,0]
X_test, Y_test = test.iloc[:,1:], test.iloc[:,0]
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)
# Retrain a fresh random forest on the temporal (2015-2016) training split.
clf = RandomForestClassifier(random_state = 42)
clf.fit(X_train, Y_train)
RandomForestClassifier(random_state=42)
# Evaluate the temporally-split model on the 2017 validation data.
Y_pred_clf = clf.predict(X_test)
# BUG FIX: sklearn metrics expect (y_true, y_pred); the original order swapped
# precision with recall (accuracy alone is symmetric).
acc_clf = accuracy_score(Y_test, Y_pred_clf)
roc_clf = roc_auc_score(Y_test, Y_pred_clf)
pre_clf = precision_score(Y_test, Y_pred_clf)
f1_clf = f1_score(Y_test, Y_pred_clf)
rec_clf = recall_score(Y_test, Y_pred_clf)
print('Accuracy Score: ', acc_clf)
print('AUC Score: ', roc_clf)
print('Precision Score: ', pre_clf)
print('F1 Score: ', f1_clf)
print('Recall Score: ', rec_clf)
Accuracy Score: 0.7852631061518421 AUC Score: 0.8022471188638124 Precision Score: 0.5527469037789775 F1 Score: 0.6657996404391233 Recall Score: 0.8369878822850548
Confirmando assim, que para esse caso, temos que o modelo treinado com a separação randômica possui resultado superior